haploRILs - Statistical analysis of simbased benchmarking analysis

jmontero

2024-10-14

Load required packages, install if required

commonly_used_packages = c("data.table", "dplyr", "data.table", "ggplot2", "tidyr")
new_packages = c("patchwork", "babynames", "ggrepel", "DT") # Introduce other packages to import
pkgs = c(commonly_used_packages, new_packages)
for (pkg in pkgs) { if (!suppressMessages(require(pkg, character.only = TRUE, quietly = TRUE))) { install.packages(pkg, character.only = TRUE, repos = "https://cran.uni-muenster.de/", quiet = TRUE) ; require(pkg, character.only = TRUE, quietly = TRUE) } }

Import metrics

metrics = fread("metrics.txt")
# Convert parameters to factors
metrics = metrics %>% dplyr::mutate(dplyr::across(c(GE_rate, n_founders, nSnp, step, K), as.factor))

Metrics distribution

print(summary(metrics))
 n_founders GE_rate  nSnp     step    K        co_detected      co_simulated  
 16 :177    0: 77   12 : 87   1:158   1:123   Min.   :  1.00   Min.   :2.000  
 100:154    1:120   48 :113   2:173   2:106   1st Qu.:  2.00   1st Qu.:3.000  
            2:134   120:131           3:102   Median :  6.00   Median :5.000  
                                              Mean   : 15.31   Mean   :4.946  
                                              3rd Qu.: 20.50   3rd Qu.:7.000  
                                              Max.   :109.00   Max.   :9.000  
      prc               rcl              f1s               res        
 Min.   :0.00000   Min.   :0.0000   Min.   :0.00000   Min.   :  5.00  
 1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.: 12.00  
 Median :0.03279   Median :0.1667   Median :0.05882   Median : 48.00  
 Mean   :0.21236   Mean   :0.2425   Mean   :0.15693   Mean   : 49.44  
 3rd Qu.:0.28571   3rd Qu.:0.4444   3rd Qu.:0.29670   3rd Qu.: 60.00  
 Max.   :1.00000   Max.   :1.0000   Max.   :0.80000   Max.   :120.00  
metrics %>% dplyr::select(co_detected, co_simulated) %>% boxplot(horizontal = TRUE)

metrics %>% dplyr::select(prc, rcl, f1s) %>% boxplot(horizontal = TRUE)

metrics %>% dplyr::select(res) %>% boxplot(horizontal = TRUE)

Calculate average metrics

metrics = metrics %>% arrange(GE_rate, n_founders, nSnp, step, K, f1s)
avr_metrics =
  metrics %>%
  dplyr::group_by(GE_rate, n_founders, nSnp, step, K) %>%
  summarise(co_detected = mean(co_detected, na.rm = TRUE), co_simulated = mean(co_simulated, na.rm = TRUE), prc = mean(prc, na.rm = TRUE), rcl = mean(rcl, na.rm = TRUE), f1s = mean(f1s, na.rm = TRUE), res = mean(res, na.rm = TRUE), .groups = "drop") %>%
  select(GE_rate, n_founders, nSnp, step, K, co_detected, co_simulated, prc, rcl, f1s, res) %>%
  arrange(GE_rate, n_founders, nSnp, step, K, co_detected, co_simulated, prc, rcl, f1s, res) %>%
  ungroup()
dt_metrics = metrics %>% dplyr::mutate(dplyr::across(c(co_detected, co_simulated, prc, rcl, f1s, res), ~ round(., 2)))
# Display the interactive table
datatable(dt_metrics, options = list(
  pageLength = 5, # Number of rows per page
  autoWidth = TRUE,
  scrollX = TRUE,
  rownames = FALSE
))

Define function to plot precision vs. recall by GE rate

# Edit table (prc, rcl and f1s must be in perc)
avr_metrics = avr_metrics %>% dplyr::mutate(across(c("prc", "rcl", "f1s"), ~ .x*100))
# Add dot label column
avr_metrics$label = paste0("F1=", round(avr_metrics$f1s, 1), "%-", "W{", avr_metrics$nSnp, ":", avr_metrics$step, "}")

# Function
plotMetrics = function(avr_metrics, K_value) {
  # Filter by K
  filtered_df = avr_metrics %>% dplyr::filter(K == K_value)
  top10perGErate = filtered_df %>% group_by(GE_rate) %>% slice_max(f1s, n = 10)
  filtered_df$label = ifelse(filtered_df$f1s %in% top10perGErate$f1s, filtered_df$label, "")
  
  plot =
    ggplot(filtered_df, aes(x = rcl, y = prc)) +
    
    # First geom_point layer for nSnp, n_founders, and step with separate legend entries
    geom_point(aes(size = as.factor(nSnp),
                   color = as.factor(n_founders),
                   shape = as.factor(step)),
               alpha = 0.5,
               show.legend = TRUE) +
    scale_x_continuous("recall %", limits = c(0, 100)) +
    scale_y_continuous("precision %", limits = c(0, 100)) +
    scale_size_discrete(name = "Window Size (#SNPs)", range = c(3, 6)) +
    scale_color_discrete(name = "#Founders") +
    scale_shape_discrete(name = "Step (#SNPs)") +
    scale_fill_viridis_d(name = "K", option = "D") + # Use a discrete viridis color scale for K
    
    # Separate legends for each aesthetic
    guides(
      color = guide_legend(order = 1, title = "#Founders"), # First legend for n_founders
      shape = guide_legend(order = 2, title = "Step (#SNPs)"), # Second legend for step
      size = guide_legend(order = 3, title = "Window Size (#SNPs)"), # Third legend for nSnp
      fill = guide_legend(order = 4, title = "K") # Fourth legend for K
    ) +
    theme_light() +
    geom_text_repel(aes(label = label),
                    size = 2.5,
                    segment.alpha = 0.4,
                    segment.color = 'grey',
                    max.overlaps = 10000,
                    vjust = 1.5) + # Adjust vjust to increase vertical spacing
    theme(
      plot.title = element_text(lineheight = .8, size = 15),
      legend.title = element_text(size = 8),
      legend.position = "top",
      legend.background = element_rect(colour = "white"),
      legend.text = element_text(size = 8),
      axis.text.x = element_text(size = 10),
      axis.text.y = element_text(size = 10),
      axis.title.x = element_text(size = 20),
      axis.title.y = element_text(size = 20),
      strip.text = element_text(size = 20, colour = "black"),
      strip.background = element_blank(),
      strip.placement = "outside"
    ) +
    facet_wrap(vars(GE_rate), nrow = 2, ncol = 5, strip.position = "top") +
    ggtitle(paste0("haploRILs performance under K = ", K_value)) # Add title for K
  return(plot)
}

Influence of parameter combinations on haploRILs performance

plot_K1 = plotMetrics(avr_metrics, K_value = 1)
plot_K2 = plotMetrics(avr_metrics, K_value = 2)
plot_K3 = plotMetrics(avr_metrics, K_value = 3)

print(plot_K1)

print(plot_K2)

print(plot_K3)

Influence of founder number on haploRILs performance

# Create individual plots
plot1 = avr_metrics %>%
  dplyr::group_by(GE_rate, n_founders) %>%
  summarise(prc = mean(prc), rcl = mean(rcl), .groups = "drop") %>%
  ungroup() %>%
  ggplot(aes(x = factor(n_founders), y = prc, fill = factor(GE_rate))) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(x = "n_founders", y = "Precision (prc)", fill = "GE Rate") +
  theme_minimal()
plot2 = avr_metrics %>%
  dplyr::group_by(GE_rate, n_founders) %>%
  summarise(prc = mean(prc), rcl = mean(rcl), .groups = "drop") %>%
  ungroup() %>%
  ggplot(aes(x = factor(n_founders), y = rcl, fill = factor(GE_rate))) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(x = "n_founders", y = "Recall (rcl)", fill = "GE Rate") +
  theme_minimal()
print(plot1)

print(plot2)

Influence of individual parameters on haploRILs performance (16 founders)

# Create individual plots
plot_K_prc = avr_metrics %>%
  dplyr::filter(n_founders == 16) %>%
  dplyr::group_by(GE_rate, K) %>%
  summarise(prc = mean(prc), rcl = mean(rcl), .groups = "drop") %>%
  ungroup() %>%
  ggplot(aes(x = factor(K), y = prc, fill = factor(GE_rate))) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(x = "K", y = "Precision (prc)", fill = "GE Rate") +
  theme_minimal()
plot_nSnp_prc = avr_metrics %>%
  dplyr::filter(n_founders == 16) %>%
  dplyr::group_by(GE_rate, nSnp) %>%
  summarise(prc = mean(prc), rcl = mean(rcl), .groups = "drop") %>%
  ungroup() %>%
  ggplot(aes(x = factor(nSnp), y = prc, fill = factor(GE_rate))) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(x = "nSnp", y = "Precision (prc)", fill = "GE Rate") +
  theme_minimal()
plot_step_prc = avr_metrics %>%
  dplyr::filter(n_founders == 16) %>%
  dplyr::group_by(GE_rate, step) %>%
  summarise(prc = mean(prc), rcl = mean(rcl), .groups = "drop") %>%
  ungroup() %>%
  ggplot(aes(x = factor(step), y = prc, fill = factor(GE_rate))) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(x = "step", y = "Precision (prc)", fill = "GE Rate") +
  theme_minimal()
plot_K_rcl = avr_metrics %>%
  dplyr::filter(n_founders == 16) %>%
  dplyr::group_by(GE_rate, K) %>%
  summarise(prc = mean(prc), rcl = mean(rcl), .groups = "drop") %>%
  ungroup() %>%
  ggplot(aes(x = factor(K), y = rcl, fill = factor(GE_rate))) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(x = "K", y = "Recall (rcl)", fill = "GE Rate") +
  theme_minimal()
plot_nSnp_rcl = avr_metrics %>%
  dplyr::filter(n_founders == 16) %>%
  dplyr::group_by(GE_rate, nSnp) %>%
  summarise(prc = mean(prc), rcl = mean(rcl), .groups = "drop") %>%
  ungroup() %>%
  ggplot(aes(x = factor(nSnp), y = rcl, fill = factor(GE_rate))) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(x = "nSnp", y = "Recall (rcl)", fill = "GE Rate") +
  theme_minimal()
plot_step_rcl = avr_metrics %>%
  dplyr::filter(n_founders == 16) %>%
  dplyr::group_by(GE_rate, step) %>%
  summarise(prc = mean(prc), rcl = mean(rcl), .groups = "drop") %>%
  ungroup() %>%
  ggplot(aes(x = factor(step), y = rcl, fill = factor(GE_rate))) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(x = "step", y = "Recall (rcl)", fill = "GE Rate") +
  theme_minimal()
# Combine the plots into a 3x2 grid (adjust to 3x6 as needed)
# combined_plot = (plot_K_prc | plot_nSnp_prc | plot_step_prc) / (plot_K_rcl | plot_nSnp_rcl | plot_step_rcl)
# Display the combined plot
print(plot_K_prc)

print(plot_nSnp_prc)

print(plot_step_prc)

print(plot_K_rcl)

print(plot_nSnp_rcl)

print(plot_step_rcl)

Inspecting best haploRILs parameters

Based on the dotplots and barplots showing combined and individual effects of parameters on haploRILs performance respectively, it seems that the best combination of haploRILs parameters are:

  • nFounders = 16

haploRILs performed clearly better in terms of precision and recall when the actual set of founders was exclusively used. Previous experience with 100 founders showed boosted crossover numbers, which matches the large numbers of detected crossovers under genotyping errors and probably explains the recall gains under genotyping errors.

  • nSnp = 12

12-SNP windows produced consistently better average precision values than bigger windows. Despite recall not being so good at 0% GE rate (since the high recalls at higher rates are associated with numerous false positives), the decision on the parameter winners should be based on precision. Thats because the ML model objective would be to classify crossovers from other loci, so the input to the model must be as accurate as possible in order to detect features associated with recombination frequency. In other words, safe crossovers are better than non-safe crossovers.

  • K = 3

At GE rate 0% the influence of K is negligible, but under realistic scenarios, higher K values improve performance sustantially. I would say that higher K values could be contemplated depending on the CO numbers obtained.

  • step = 1

The new parameter did not produce the results expected, since steps causing window overlap (step = 2) performed consistently worse than the non-overlapping (step = 1). However, the effect could be positive in some scenarios. More overlap also has a sustantial effect on resolution.

winningParameters = c(16, 12, 1, 3)
names(winningParameters) = c("n_founders", "nSnp", "step", "K")
winning_metrics = avr_metrics %>%
  dplyr::filter(
    n_founders == winningParameters["n_founders"],
    nSnp == winningParameters["nSnp"],
    step == winningParameters["step"],
    K == winningParameters["K"]
  )
dt_metrics = winning_metrics %>% dplyr::mutate(dplyr::across(c(co_detected, co_simulated, prc, rcl, f1s, res), ~ round(., 2)))
# Display the interactive table
datatable(dt_metrics, options = list(
  pageLength = 5, # Number of rows per page
  autoWidth = TRUE,
  scrollX = TRUE,
  rownames = FALSE
))
winning_metrics_long = winning_metrics %>%
  pivot_longer(cols = c(prc, rcl), names_to = "Metric", values_to = "Value")

# Create the barplot
ggplot(winning_metrics_long, aes(x = GE_rate, y = Value, fill = Metric)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(x = "GE_rate", y = "Value", title = "Precision and Recall by GE rate") +
  theme_minimal() +
  scale_fill_manual(values = c("prc" = "blue", "rcl" = "red"))

Testing other alternatives

# Step = 2
altParameters = c(16, 12, 2, 3)
names(altParameters) = c("n_founders", "nSnp", "step", "K")
alt_metrics = avr_metrics %>%
  dplyr::filter(
    n_founders == altParameters["n_founders"],
    nSnp == altParameters["nSnp"],
    step == altParameters["step"],
    K == altParameters["K"]
  )
dt_metrics = alt_metrics %>% dplyr::mutate(dplyr::across(c(co_detected, co_simulated, prc, rcl, f1s, res), ~ round(., 2)))
# Display the interactive table
datatable(dt_metrics, options = list(
  pageLength = 5, # Number of rows per page
  autoWidth = TRUE,
  scrollX = TRUE,
  rownames = FALSE
))
avr_metrics_long = alt_metrics %>%
  pivot_longer(cols = c(prc, rcl), names_to = "Metric", values_to = "Value")

# Create the barplot
ggplot(avr_metrics_long, aes(x = GE_rate, y = Value, fill = Metric)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(x = "GE_rate", y = "Value", title = "Precision and Recall by GE rate") +
  theme_minimal() +
  scale_fill_manual(values = c("prc" = "blue", "rcl" = "red"))

# K = 2
altParameters = c(16, 12, 1, 2)
names(altParameters) = c("n_founders", "nSnp", "step", "K")
alt_metrics = avr_metrics %>%
  dplyr::filter(
    n_founders == altParameters["n_founders"],
    nSnp == altParameters["nSnp"],
    step == altParameters["step"],
    K == altParameters["K"]
  )
dt_metrics = alt_metrics %>% dplyr::mutate(dplyr::across(c(co_detected, co_simulated, prc, rcl, f1s, res), ~ round(., 2)))
# Display the interactive table
datatable(dt_metrics, options = list(
  pageLength = 5, # Number of rows per page
  autoWidth = TRUE,
  scrollX = TRUE,
  rownames = FALSE
))
avr_metrics_long = alt_metrics %>%
  pivot_longer(cols = c(prc, rcl), names_to = "Metric", values_to = "Value")

# Create the barplot
ggplot(avr_metrics_long, aes(x = GE_rate, y = Value, fill = Metric)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(x = "GE_rate", y = "Value", title = "Precision and Recall by GE rate") +
  theme_minimal() +
  scale_fill_manual(values = c("prc" = "blue", "rcl" = "red"))